In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from constants import INF, SPATIAL, MORPHOLOGICAL, TEMPORAL 
In [2]:
PATH = 'clustersData/0'
In [3]:
df = None
files = os.listdir(PATH)
for file in sorted(files):
    if df is None:
        df = pd.read_csv(PATH + '/' + file)
    else:
        temp = pd.read_csv(PATH + '/' + file)
        df = df.append(temp)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1063 entries, 0 to 0
Data columns (total 33 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   dep_red                   1063 non-null   float64
 1   dep_sd                    1063 non-null   float64
 2   hyp_red                   1063 non-null   float64
 3   hyp_sd                    1063 non-null   float64
 4   graph_avg_speed           1063 non-null   float64
 5   graph_slowest_path        1063 non-null   float64
 6   graph_fastest_path        1063 non-null   float64
 7   geometrical_avg_shift     1063 non-null   float64
 8   geometrical_shift_sd      1063 non-null   float64
 9   geometrical_max_dist      1063 non-null   float64
 10  spatial_dispersion_count  1063 non-null   float64
 11  spatial_dispersion_sd     1063 non-null   float64
 12  da                        1063 non-null   float64
 13  da_sd                     1063 non-null   float64
 14  Channels contrast         1063 non-null   float64
 15  break_measure             1063 non-null   float64
 16  fwhm                      1063 non-null   float64
 17  get_acc                   1063 non-null   float64
 18  max_speed                 1063 non-null   float64
 19  peak2peak                 1063 non-null   float64
 20  trough2peak               1063 non-null   float64
 21  rise_coef                 1063 non-null   float64
 22  smile_cry                 1063 non-null   float64
 23  d_kl                      1063 non-null   float64
 24  jump                      1063 non-null   float64
 25  psd_center                1063 non-null   float64
 26  der_psd_center            1063 non-null   float64
 27  rise_time                 1063 non-null   float64
 28  unif_dist                 1063 non-null   float64
 29  num_spikes                1063 non-null   float64
 30  max_abs                   1063 non-null   float64
 31  name                      1063 non-null   object 
 32  label                     1063 non-null   float64
dtypes: float64(32), object(1)
memory usage: 282.4+ KB
In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
df.head()
Out[5]:
dep_red dep_sd hyp_red hyp_sd graph_avg_speed graph_slowest_path graph_fastest_path geometrical_avg_shift geometrical_shift_sd geometrical_max_dist spatial_dispersion_count spatial_dispersion_sd da da_sd Channels contrast break_measure fwhm get_acc max_speed peak2peak trough2peak rise_coef smile_cry d_kl jump psd_center der_psd_center rise_time unif_dist num_spikes max_abs name label
0 2.75 0.707107 2.75 0.707107 68.149943 10.965856 262.164746 0.211559 0.124836 29.826142 2.0 0.280689 936.0 1.466900 0.0 -331.844968 36.0 14.250416 35.0 734.301418 80.0 59.0 -11.477378 0.376905 502.096897 11.360348 6.023163 243.0 -0.096432 55919.0 555.943579 es04feb12_1_1_2 -1.0
0 195.00 9.205976 195.00 9.205976 7.660231 2.444956 52.125445 0.404714 0.320027 58.920182 1.0 0.238903 290.0 0.849948 0.0 -1008.448141 36.0 86.991241 37.0 1454.894065 56.0 43.0 -13.845540 0.215786 3.901893 535.072365 514.048222 147.0 0.044684 1803.0 1154.678314 es04feb12_1_2_10 -1.0
0 272.00 13.301786 272.00 13.301786 13.562496 1.078448 117.613923 0.144514 0.081919 20.470874 1.0 0.294090 598.0 1.169164 0.0 -1443.647863 51.0 59.464278 68.0 1627.022787 116.0 115.0 -21.964323 0.192050 14.593452 158.464169 282.278652 101.0 0.162192 1937.0 979.782137 es04feb12_1_2_11 -1.0
0 28.00 4.358899 28.00 4.358899 28.967240 4.642857 145.746209 0.179741 0.084282 25.096477 1.0 0.283303 918.0 1.436056 0.0 -751.235046 34.0 34.902512 28.0 916.768223 117.0 53.0 -8.240933 0.148997 106.967168 86.035406 126.522112 91.0 0.164513 8214.0 658.024836 es04feb12_1_2_12 1.0
0 43.75 6.123724 43.75 6.123724 16.491517 3.920049 63.582105 0.206621 0.073081 33.743488 1.0 0.298570 1028.0 1.450329 0.0 -1854.952889 39.0 48.542376 64.0 2123.447813 116.0 116.0 -14.980838 0.214417 23.869564 67.870324 105.331613 117.0 0.118233 3476.0 1333.102992 es04feb12_1_2_13 1.0
In [6]:
df.describe()
Out[6]:
dep_red dep_sd hyp_red hyp_sd graph_avg_speed graph_slowest_path graph_fastest_path geometrical_avg_shift geometrical_shift_sd geometrical_max_dist spatial_dispersion_count spatial_dispersion_sd da da_sd Channels contrast break_measure fwhm get_acc max_speed peak2peak trough2peak rise_coef smile_cry d_kl jump psd_center der_psd_center rise_time unif_dist num_spikes max_abs label
count 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1063.000000 1.063000e+03 1063.000000 1063.000000
mean 628.031162 9.764127 423.294920 8.735876 21.080586 6.883599 105.876222 0.378737 0.344269 56.418367 2.405456 0.304798 1012.539981 1.172401 4.297912 -1584.402902 35.035748 41.226457 36.534337 1806.247124 101.688617 58.140169 -12.634168 0.229378 239.226058 119.953114 112.025357 139.673565 0.079918 3.647276e+04 1258.553004 -0.113829
std 2305.143697 14.913159 1499.272555 12.112884 14.168496 6.750139 59.281135 0.191845 0.530900 27.907055 1.319936 0.038801 529.668999 0.211951 34.494323 1126.937620 9.293126 437.239316 19.960566 1073.016626 28.731892 23.353746 12.949271 0.095259 179.428501 129.835298 133.004510 61.760409 0.111791 9.748708e+04 741.626245 0.956309
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.020859 0.018442 2.465271 1.000000 0.156309 124.000000 0.517692 0.000000 -7511.872638 15.000000 -9999.000000 7.000000 16.824444 10.000000 8.000000 -84.177516 0.077713 -117.249486 6.877197 3.472632 49.000000 -0.361332 4.290000e+02 20.546620 -3.000000
25% 10.187500 2.546444 10.125000 2.541833 10.623957 2.411846 58.118350 0.237632 0.134977 35.784426 1.000000 0.281145 696.000000 1.058399 0.000000 -2158.791261 30.000000 40.805642 22.000000 1102.444308 99.500000 42.000000 -17.380732 0.167505 105.402888 41.656845 25.367231 94.000000 0.004028 4.128000e+03 790.127821 -1.000000
50% 47.000000 4.924429 46.375000 4.910130 17.609260 5.024938 96.902804 0.363474 0.234468 53.842292 2.000000 0.304111 902.000000 1.200967 0.000000 -1346.885163 34.000000 57.415023 31.000000 1588.715455 117.000000 52.000000 -12.210957 0.207251 217.134766 80.605792 65.570565 125.000000 0.095493 9.865000e+03 1097.891450 -1.000000
75% 189.812500 9.581068 187.687500 9.570051 28.048278 8.933223 143.574079 0.484546 0.356825 71.738375 3.000000 0.327343 1182.000000 1.315380 0.000000 -804.605535 38.000000 79.180716 45.000000 2306.117919 117.000000 69.000000 -7.959136 0.263534 339.933126 132.118259 137.636853 175.000000 0.162358 2.417300e+04 1602.155878 1.000000
max 34548.250000 114.713502 19143.750000 85.540561 83.777180 60.299254 366.748329 1.855837 8.452481 140.667142 8.000000 0.421263 3256.000000 1.666536 541.596859 476.111022 91.000000 196.104579 123.000000 6832.274229 203.000000 117.000000 107.086444 0.985809 743.055806 600.747082 602.508346 415.000000 0.298575 1.526168e+06 4953.776048 1.000000
In [7]:
df = df.loc[df.label >= 0]
df = df.loc[df.d_kl != -INF]
In [8]:
df.describe()
Out[8]:
dep_red dep_sd hyp_red hyp_sd graph_avg_speed graph_slowest_path graph_fastest_path geometrical_avg_shift geometrical_shift_sd geometrical_max_dist spatial_dispersion_count spatial_dispersion_sd da da_sd Channels contrast break_measure fwhm get_acc max_speed peak2peak trough2peak rise_coef smile_cry d_kl jump psd_center der_psd_center rise_time unif_dist num_spikes max_abs label
count 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 530.000000 5.300000e+02 530.000000 530.000000
mean 361.081604 7.658351 250.441509 7.019369 22.048016 7.362424 109.435652 0.388699 0.328867 56.450173 2.341509 0.303413 932.535849 1.156497 1.981877 -1791.602323 33.728302 46.631937 36.196226 1997.128521 101.256604 58.498113 -13.660958 0.206866 271.730833 93.773239 83.975813 124.367925 0.107961 4.845652e+04 1384.816295 0.792453
std 1533.544371 11.362721 1001.498470 9.166846 13.019691 6.619013 59.355767 0.192545 0.510884 26.684352 1.275912 0.038972 456.345407 0.201883 11.178278 1142.791419 7.476602 437.948131 19.748302 1094.675370 29.712011 23.286353 10.798125 0.071769 175.055112 89.062560 96.581353 53.610463 0.098882 1.248243e+05 755.576197 0.405934
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.047929 0.018442 8.699463 1.000000 0.174545 130.000000 0.562012 0.000000 -7511.872638 18.000000 -9999.000000 10.000000 16.824444 10.000000 8.000000 -77.572713 0.077713 -71.559562 7.442545 3.534918 49.000000 -0.254774 6.980000e+02 21.514729 0.000000
25% 9.000000 2.384848 9.000000 2.372521 12.433902 2.690582 62.281733 0.237298 0.135322 37.151902 1.000000 0.277655 653.000000 1.039708 0.000000 -2316.092753 30.000000 45.325465 22.000000 1264.411507 113.250000 41.000000 -17.854637 0.150766 134.908088 38.019227 21.105335 80.000000 0.046329 6.363500e+03 862.782529 1.000000
50% 34.312500 4.262842 34.125000 4.225932 19.786739 5.742786 101.575195 0.372739 0.240819 53.958976 2.000000 0.300317 856.000000 1.175606 0.000000 -1550.039051 33.000000 62.376723 30.000000 1770.332448 117.000000 52.000000 -12.447052 0.198755 248.929871 76.188401 57.051944 117.000000 0.113020 1.319900e+04 1197.972815 1.000000
75% 130.500000 8.135570 127.750000 8.053518 28.961900 9.748466 146.573724 0.498573 0.361715 71.607345 3.000000 0.328254 1065.000000 1.287813 0.000000 -1007.877874 37.000000 85.567910 44.750000 2517.085157 117.000000 68.000000 -8.424648 0.235705 361.875435 118.221308 104.912136 153.000000 0.186693 2.929325e+04 1719.008969 1.000000
max 19719.250000 93.345795 16525.500000 78.887578 70.289427 50.717354 366.748329 1.640206 8.452481 140.667142 7.000000 0.421263 3256.000000 1.643654 145.254381 242.645403 85.000000 144.122825 114.000000 6832.274229 203.000000 117.000000 17.596043 0.681195 743.055806 555.472016 549.873485 361.000000 0.295213 1.526168e+06 4682.726671 1.000000
In [9]:
TARGET_COLUMN_NAME = 'label'
# Select all numerical features.
numerical_features = df.select_dtypes(["float64", "int64"])
plot_df = numerical_features.astype("float64")  # this is done to solve a problem in sns (see https://datascience.stackexchange.com/questions/55435/seaborn-violin-plot-error-no-loop-for-unfunc-add)

# Create distribution plots.
nrows = len(numerical_features.columns)
fig, ax = plt.subplots(nrows=nrows, ncols=2, figsize=(20, 40))
for i, feature in enumerate(numerical_features):
    sns.violinplot(x=TARGET_COLUMN_NAME, y=feature, data=plot_df, ax=ax[i, 0])
    if i == 0:
        ax[i, 0].set_title("Violin Plots")
        ax[i, 1].set_title("Box Plots")        
    sns.boxplot(x=TARGET_COLUMN_NAME, y=feature, data=plot_df, ax=ax[i, 1])
    ax[i, 0].set_xlabel("")
    ax[i, 1].set_xlabel("")
    ax[i, 1].set_ylabel("")
    ax[i, 0].set_ylabel(feature, rotation=45, labelpad=50)
_ = fig.text(0.6, 0, "cell type", ha='center')
_ = fig.suptitle("Numerical Feature Distributions", y=1, x=0.6)
fig.tight_layout()
In [10]:
correlation_matrix = df.corr()
fig, ax = plt.subplots(figsize=(22, 22))
_ = sns.heatmap(correlation_matrix, annot=True, fmt='.2f')
In [11]:
sns.pairplot(df, hue="label")
Out[11]:
<seaborn.axisgrid.PairGrid at 0x1fc1ce61280>
In [ ]: